4 Laboratory sample processing
4.1 DNA extraction
4.1.1 General statistics
read_tsv("data/extraction.tsv") %>%
summarise(
max= max(extraction_total, na.rm = TRUE),
min= min(extraction_total, na.rm = TRUE),
mean= mean(extraction_total, na.rm = TRUE),
sd = sd(extraction_total, na.rm = TRUE)
) %>%
tt()| max | min | mean | sd |
|---|---|---|---|
| 7110 | 0 | 365.683 | 677.4012 |
4.1.2 Sample types
4.1.2.1 Data distribution
left_join(read_tsv("data/extraction.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
ggplot(aes(x=sample_type, y= extraction_total, fill=sample_type, color=sample_type)) +
ylim(0, 2000) +
geom_jitter(position = position_jitter(width = 0.2), alpha = 0.5, size=0.5) +
stat_halfeye(adjust = 0.5,width = 0.5, .width = 0, justification = -.55,normalize = "groups") +
scale_color_manual(values = c("#bdca50", "#AA3377")) +
scale_fill_manual(values = c("#bdca5050", "#AA337750")) +
labs(y="Density",x="DNA yield", fill="Sample type", color="Sample type") +
theme_classic()4.1.2.2 Comparison
left_join(read_tsv("data/extraction.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
mutate(extraction_total = ifelse(extraction_total == 0, 0.00001, extraction_total)) %>%
mutate(extraction_total = log(extraction_total)) %>%
select(extraction_total,sample_type) %>%
mutate(sample_type = factor(sample_type)) %>%
wilcox.test(extraction_total ~ sample_type,data=.) %>%
tidy()# A tibble: 1 × 4
statistic p.value method alternative
<dbl> <dbl> <chr> <chr>
1 335473 3.41e-41 Wilcoxon rank sum test with continuity correction two.sided
left_join(read_tsv("data/extraction.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
group_by(sample_type) %>%
summarise(
mean= mean(extraction_total, na.rm = TRUE),
sd = sd(extraction_total, na.rm = TRUE)) %>%
tt()| sample_type | mean | sd |
|---|---|---|
| Anal/cloacal swab | 170.2285 | 362.7389 |
| Faecal | 316.6666 | 423.6434 |
4.1.3 Taxonomy
4.1.3.1 Data distribution
left_join(read_tsv("data/extraction.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>%
ggplot(aes(y=extraction_total,x=tax_group,color=tax_group,fill=tax_group)) +
ylim(0, 2000) +
geom_jitter(position = position_jitter(width = 0.2), alpha = 0.5, size=0.5) +
stat_halfeye(adjust = 0.5,width = 0.5, .width = 0, justification = -.55,normalize = "groups") +
scale_color_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
scale_fill_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
labs(y="DNA yield",x="Taxonomic group", color="Sample type") +
theme_classic()4.1.3.2 Comparison
left_join(read_tsv("data/extraction.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
mutate(extraction_total = ifelse(extraction_total == 0, 0.00001, extraction_total)) %>%
mutate(extraction_total = log(extraction_total)) %>%
select(extraction_total,tax_group) %>%
mutate(sample_type = factor(tax_group)) %>%
kruskal.test(extraction_total ~ tax_group,data=.)
Kruskal-Wallis rank sum test
data: extraction_total by tax_group
Kruskal-Wallis chi-squared = 946.6, df = 4, p-value < 2.2e-16
left_join(read_tsv("data/extraction.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
group_by(tax_group) %>%
summarise(
mean= mean(extraction_total, na.rm = TRUE),
sd = sd(extraction_total, na.rm = TRUE)) %>%
tt()| tax_group | mean | sd |
|---|---|---|
| Amphibians | 233.35078 | 358.90219 |
| Bats | 59.60574 | 92.00357 |
| Birds | 74.43550 | 262.34517 |
| Mammals | 463.93202 | 471.41487 |
| Reptiles | 394.50619 | 437.86229 |
4.2 Sequencing library preparation
4.2.1 Overall
left_join(read_tsv("data/library.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
filter(library_PCR_cycles_required > 0) %>%
summarise(
input=mean(library_input_dna, na.rm = TRUE),
max= max(library_PCR_cycles_required, na.rm = TRUE),
min= min(library_PCR_cycles_required, na.rm = TRUE),
mean= mean(library_PCR_cycles_required, na.rm = TRUE),
sd = sd(library_PCR_cycles_required, na.rm = TRUE)) %>%
tt()| input | max | min | mean | sd |
|---|---|---|---|---|
| 89.34565 | 33 | 2 | 9.150046 | 4.390537 |
left_join(read_tsv("data/library.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
lm(library_PCR_cycles_required ~ library_input_dna, data = .) %>%
tidy()# A tibble: 2 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 11.6 0.110 105. 0
2 library_input_dna -0.0284 0.000901 -31.6 1.11e-183
left_join(read_tsv("data/library.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
lm(library_PCR_cycles_required ~ library_input_dna * sample_type, data = .) %>%
anova() %>%
tidy()# A tibble: 4 × 6
term df sumsq meansq statistic p.value
<chr> <int> <dbl> <dbl> <dbl> <dbl>
1 library_input_dna 1 14071. 14071. 1071. 4.30e-195
2 sample_type 1 2439. 2439. 186. 8.48e- 41
3 library_input_dna:sample_type 1 0.0500 0.0500 0.00381 9.51e- 1
4 Residuals 2441 32067. 13.1 NA NA
left_join(read_tsv("data/library.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
lm(library_PCR_cycles_required ~ library_input_dna * tax_group, data = .) %>%
anova() %>%
tidy()# A tibble: 4 × 6
term df sumsq meansq statistic p.value
<chr> <int> <dbl> <dbl> <dbl> <dbl>
1 library_input_dna 1 14071. 14071. 1145. 5.17e-206
2 tax_group 4 4140. 1035. 84.2 4.74e- 67
3 library_input_dna:tax_group 4 435. 109. 8.85 4.32e- 7
4 Residuals 2435 29931. 12.3 NA NA
4.2.2 Sample types
left_join(read_tsv("data/library.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
filter(library_input_dna <= 200) %>%
filter(library_PCR_cycles_required > 0) %>%
ggplot(aes(y=library_PCR_cycles_required, x=library_input_dna, color=sample_type, fill=sample_type, group=sample_type)) +
geom_point(alpha=0.5, size=0.5) +
stat_smooth(method = "lm", formula = y ~ x, geom = "smooth") +
scale_color_manual(values = c("#bdca50", "#AA3377")) +
scale_fill_manual(values = c("#bdca5050", "#AA337750")) +
theme_classic() +
labs(y="Required number of cycles",x="Amount of inputted DNA (ng)", color="Sample type") +
theme_classic()4.2.2.1 Comparisons
Relationship between the amount of inputted DNA (ng) and the required number of cycles.
left_join(read_tsv("data/library.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
filter(library_PCR_cycles_required>0 & library_input_dna>0) %>%
lme(library_PCR_cycles_required ~ library_input_dna, random=~1 | sample_type, data = .) %>%
broom.mixed::tidy() %>%
tt()| effect | group | term | estimate | std.error | df | statistic | p.value |
|---|---|---|---|---|---|---|---|
| fixed | NA | (Intercept) | 11.87417166 | 1.1716203608 | 2330 | 10.13483 | 1.187767e-23 |
| fixed | NA | library_input_dna | -0.02501844 | 0.0008256453 | 2330 | -30.30168 | 2.507418e-170 |
| ran_pars | sample_type | sd_(Intercept) | 1.64945139 | NA | NA | NA | NA |
| ran_pars | Residual | sd_Observation | 3.30432885 | NA | NA | NA | NA |
Comparison between slopes.
left_join(read_tsv("data/library.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
lm(library_PCR_cycles_required ~ library_input_dna * sample_type, data = .) %>%
broom::tidy() %>%
tt()| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 13.3047246163 | 0.187031107 | 71.13642643 | 0.000000e+00 |
| library_input_dna | -0.0260524410 | 0.001812871 | -14.37082274 | 5.140847e-45 |
| sample_typeFaecal | -2.4762116244 | 0.227612093 | -10.87908638 | 5.960119e-27 |
| library_input_dna:sample_typeFaecal | 0.0001282468 | 0.002078874 | 0.06169052 | 9.508143e-01 |
4.2.3 Taxonomy
left_join(read_tsv("data/library.tsv"),
read_tsv("data/sample.tsv"),
by="sample_id") %>%
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
filter(library_input_dna <= 200) %>%
filter(library_PCR_cycles_required > 0) %>%
mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>%
ggplot(., aes(y=library_PCR_cycles_required, x=library_input_dna, color=tax_group, group=tax_group)) +
geom_point(alpha=0.5, size=0.5) +
stat_smooth(aes(fill = tax_group), method = "lm", formula = y ~ x, geom = "smooth") +
scale_color_manual(values = c("#228833","#EE6677","#CCBB44","#66CCEE","#4477AA")) +
scale_fill_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
theme_classic() +
labs(y="Required number of cycles",x="Amount of inputted DNA (ng)", color="Sample type", fill="Sample type") +
theme_classic()4.3 Data quality
left_join(read_tsv("data/preprocessing.tsv"),read_tsv("data/sample.tsv"),by="sample_id") %>%
left_join(read_tsv("data/sequence.tsv"),by="sequence_id") %>%
left_join(read_tsv("data/index.tsv"),by="index_id") %>%
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
mutate(lowqual_perc_reads=(1-reads_post_fastp/reads_pre_fastp)*100) %>%
mutate(lowqual_perc_bases=(1-bases_post_fastp/bases_pre_fastp)*100) %>%
#Plot map EE6677 < bats
ggplot(aes(y=lowqual_perc_bases, x=index_PCR_cycles_given, colour=sample_type, fill=sample_type, group=sample_type)) +
geom_jitter(alpha=0.3) +
stat_smooth(method = "lm", formula = y ~ x, geom = "smooth", alpha=0.2) +
scale_color_manual(values = c("#bdca50", "#AA3377")) +
scale_fill_manual(values = c("#bdca5080", "#AA337780")) +
scale_x_log10() +
theme_classic()left_join(read_tsv("data/preprocessing.tsv"),read_tsv("data/sample.tsv"),by="sample_id") %>%
left_join(read_tsv("data/sequence.tsv"),by="sequence_id") %>%
left_join(read_tsv("data/index.tsv"),by="index_id") %>%
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
mutate(lowqual_perc_reads=(1-reads_post_fastp/reads_pre_fastp)*100) %>%
mutate(lowqual_perc_bases=(1-bases_post_fastp/bases_pre_fastp)*100) %>%
mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>%
ggplot(aes(y=lowqual_perc_bases, x=index_PCR_cycles_given, colour=tax_group, fill=tax_group, group=tax_group)) +
geom_point(alpha=0.3) +
stat_smooth(method = "lm", formula = y ~ x, geom = "smooth", alpha=0.2) +
scale_color_manual(values = c("#228833","#EE6677","#CCBB44","#66CCEE","#4477AA")) +
scale_fill_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
scale_x_log10() +
theme_classic() +
theme(legend.position = "bottom")4.4 Data duplication
left_join(read_tsv("data/preprocessing.tsv"),read_tsv("data/sample.tsv"),by="sample_id") %>%
left_join(read_tsv("data/sequence.tsv"),by="sequence_id") %>%
left_join(read_tsv("data/index.tsv"),by="index_id") %>%
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
mutate(lowqual_perc_reads=(1-reads_post_fastp/reads_pre_fastp)*100) %>%
mutate(lowqual_perc_bases=(1-bases_post_fastp/bases_pre_fastp)*100) %>%
#Plot map EE6677 < bats
ggplot(aes(y=host_duplicate_fraction, x=index_PCR_cycles_given, colour=sample_type, fill=sample_type, group=sample_type)) +
geom_jitter(alpha=0.3) +
stat_smooth(method = "lm", formula = y ~ x, geom = "smooth", alpha=0.2) +
scale_color_manual(values = c("#bdca50", "#AA3377")) +
scale_fill_manual(values = c("#bdca5080", "#AA337780")) +
scale_x_log10() +
theme_classic()left_join(read_tsv("data/preprocessing.tsv"),read_tsv("data/sample.tsv"),by="sample_id") %>%
left_join(read_tsv("data/sequence.tsv"),by="sequence_id") %>%
left_join(read_tsv("data/index.tsv"),by="index_id") %>%
filter(sample_type %in% c("Faecal", "Anal/cloacal swab")) %>%
mutate(lowqual_perc_reads=(1-reads_post_fastp/reads_pre_fastp)*100) %>%
mutate(lowqual_perc_bases=(1-bases_post_fastp/bases_pre_fastp)*100) %>%
mutate(tax_group=factor(tax_group,levels=c("Amphibians","Reptiles","Birds","Bats","Mammals"))) %>%
ggplot(aes(y=host_duplicate_fraction, x=index_PCR_cycles_given, colour=tax_group, fill=tax_group, group=tax_group)) +
geom_point(alpha=0.3) +
stat_smooth(method = "lm", formula = y ~ x, geom = "smooth", alpha=0.2) +
scale_color_manual(values = c("#228833","#EE6677","#CCBB44","#66CCEE","#4477AA")) +
scale_fill_manual(values = c("#22883380","#EE667780","#CCBB4480","#66CCEE80","#4477AA80")) +
scale_x_log10() +
theme_classic() +
theme(legend.position = "bottom")